#Loading Libraries
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(boot)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmboot)
library(lattice)
##
## Attaching package: 'lattice'
##
## The following object is masked from 'package:boot':
##
## melanoma
library(caret)
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(naniar)
library(utils)
library(stats)
## Load the life-expectancy data set
# NOTE(review): the working directory is a machine-specific absolute path.
setwd("/Users/xaviermojica/Desktop/Stats2/Project1")
life <- read.csv(file = "Life Expectancy Data (1).csv")
# Raw GDP against life expectancy, before any cleaning or transformation.
ggplot(life, aes(x = GDP, y = Life.expectancy)) +
  geom_point()
## Warning: Removed 453 rows containing missing values (`geom_point()`).
## Looking at the scatterplot of the original data set, GDP (the x variable)
## appears to need a log transformation, since we are interested in the
## relationship between Life Expectancy and GDP.
## Checking data types
# Print each column's name, type and first few values.
str(life)
## 'data.frame': 2938 obs. of 22 variables:
## $ Country : chr "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
## $ Year : int 2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 ...
## $ Status : chr "Developing" "Developing" "Developing" "Developing" ...
## $ Life.expectancy : num 65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
## $ Adult.Mortality : int 263 271 268 272 275 279 281 287 295 295 ...
## $ infant.deaths : int 62 64 66 69 71 74 77 80 82 84 ...
## $ Alcohol : num 0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
## $ percentage.expenditure : num 71.3 73.5 73.2 78.2 7.1 ...
## $ Hepatitis.B : int 65 62 64 67 68 66 63 64 63 64 ...
## $ Measles : int 1154 492 430 2787 3013 1989 2861 1599 1141 1990 ...
## $ BMI : num 19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
## $ under.five.deaths : int 83 86 89 93 97 102 106 110 113 116 ...
## $ Polio : int 6 58 62 67 68 66 63 64 63 58 ...
## $ Total.expenditure : num 8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
## $ Diphtheria : int 65 62 64 67 68 66 63 64 63 58 ...
## $ HIV.AIDS : num 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
## $ GDP : num 584.3 612.7 631.7 670 63.5 ...
## $ Population : num 33736494 327582 31731688 3696958 2978599 ...
## $ thinness..1.19.years : num 17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
## $ thinness.5.9.years : num 17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
## $ Income.composition.of.resources: num 0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
## $ Schooling : num 10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
# Plot which cells of the raw data are missing.
vis_miss(life)
# Number of rows and columns in the raw data.
dim(life)
## [1] 2938 22
# Open the raw data in the interactive viewer.
View(life)
# Median imputation
# GDP (column 17) is about 15% missing. It is kept and imputed despite that
# fairly high share because it is assumed to be crucial for predicting
# Life.expectancy: richer countries have better access to health care,
# medicine and technology. The values appear to be GDP per capita, which
# already accounts for population size, so GDP per capita and Population
# would be closely related and would probably be collinear.
# Columns 1:4 (Country, Year, Status and the response Life.expectancy) and
# column 9 (Hepatitis.B) are left out of the imputation fit.
imputeMedian <- preProcess(
  life[, -c(1:4, 9)],
  method = "medianImpute"
)
cleandataMedian <- predict(imputeMedian, newdata = life)
dim(cleandataMedian)
## [1] 2938 22
# Rotate the x-axis labels of the missingness plot so column names stay readable.
vis_miss(cleandataMedian) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0))
# Literature says that more than 10% missing data can contribute to bias.
# Hepatitis.B (column 9) is 19% missing and Population (column 18) is 22%
# missing, so both columns are removed. They are dropped by name rather than
# by position so the step keeps working if the column order ever changes.
high_missing_cols <- c("Hepatitis.B", "Population")
cleandataMedian <- cleandataMedian[
  ,
  setdiff(names(cleandataMedian), high_missing_cols)
]
vis_miss(cleandataMedian) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0))
# Remove the rows that still contain an NA after imputation.
cleandataMedian <- na.omit(cleandataMedian)
vis_miss(cleandataMedian) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0))
# GDP against life expectancy on the cleaned data.
ggplot(data = cleandataMedian) +
  geom_point(mapping = aes(x = GDP, y = Life.expectancy))
# Add log-scale versions of GDP and of the response.
cleandataMedian[["logGDP"]] <- log(cleandataMedian[["GDP"]])
cleandataMedian[["logLife.expectancy"]] <-
  log(cleandataMedian[["Life.expectancy"]])
# Scatterplot of the two log-transformed variables.
ggplot(cleandataMedian, aes(x = logGDP, y = logLife.expectancy)) +
  geom_point()
# Split data: 80% of the rows go to training, the rest to validation.
# The seed makes the partition reproducible.
set.seed(1234)
# list = FALSE (spelled out; `F` is an ordinary variable that can be
# reassigned) returns the selected row indices instead of a list.
trainIndex <- createDataPartition(
  cleandataMedian$Life.expectancy,
  p = 0.8,
  list = FALSE
)
training <- cleandataMedian[trainIndex, ]
validate <- cleandataMedian[-trainIndex, ]
#Multivariable Plots on Full Data
library(ISLR)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(GGally)
library(ggplot2)
# Pairwise plots on the training data: Life.expectancy (column 4) together
# with four other columns at a time. All four plots share the same styling.
pairs_lower <- list(
  continuous = wrap("points", color = "red", alpha = 0.5),
  combo = wrap("box", color = "orange", alpha = 0.3),
  discrete = wrap("facetbar", color = "yellow", alpha = 0.3)
)
pairs_diag <- list(
  continuous = wrap("densityDiag", color = "blue", alpha = 0.5)
)
ggpairs(training[, 4:8], lower = pairs_lower, diag = pairs_diag)
ggpairs(training[, c(4, 9:12)], lower = pairs_lower, diag = pairs_diag)
ggpairs(training[, c(4, 13:16)], lower = pairs_lower, diag = pairs_diag)
ggpairs(training[, c(4, 17:20)], lower = pairs_lower, diag = pairs_diag)
#ggpairs(cleandataMedian[,5:22], upper = list(continuous = wrap("cor", size = 4.75, align_percent = 1)))
#ggscatmat(cleandataMedian, columns = 4:10)
# Chosen MLR model (its residual diagnostics are plotted further down).
# NOTE(review): despite the name, eightVar is fitted with nine predictors.
set.seed(2345)
eightVar <- lm(
  Life.expectancy ~ HIV.AIDS + Schooling + Alcohol + BMI + Polio +
    Diphtheria + logGDP + thinness..1.19.years +
    Income.composition.of.resources,
  data = training
)
summary(eightVar)
##
## Call:
## lm(formula = Life.expectancy ~ HIV.AIDS + Schooling + Alcohol +
## BMI + Polio + Diphtheria + logGDP + thinness..1.19.years +
## Income.composition.of.resources, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.4141 -2.5857 0.1184 2.7623 19.2743
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.206803 0.625395 69.087 < 2e-16 ***
## HIV.AIDS -0.681485 0.019905 -34.236 < 2e-16 ***
## Schooling 0.807629 0.055413 14.575 < 2e-16 ***
## Alcohol 0.048487 0.030055 1.613 0.107
## BMI 0.055012 0.006301 8.731 < 2e-16 ***
## Polio 0.029326 0.005694 5.151 2.82e-07 ***
## Diphtheria 0.048879 0.005683 8.601 < 2e-16 ***
## logGDP 0.579798 0.070900 8.178 4.69e-16 ***
## thinness..1.19.years -0.113199 0.027585 -4.104 4.21e-05 ***
## Income.composition.of.resources 7.810187 0.825457 9.462 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.657 on 2334 degrees of freedom
## Multiple R-squared: 0.7607, Adjusted R-squared: 0.7598
## F-statistic: 824.6 on 9 and 2334 DF, p-value: < 2.2e-16
# Confidence intervals (2.5% and 97.5% bounds) for the eightVar coefficients.
confint(eightVar)
## 2.5 % 97.5 %
## (Intercept) 41.98041532 44.43319143
## HIV.AIDS -0.72051911 -0.64245093
## Schooling 0.69896506 0.91629379
## Alcohol -0.01045042 0.10742491
## BMI 0.04265642 0.06736755
## Polio 0.01816053 0.04049132
## Diphtheria 0.03773517 0.06002214
## logGDP 0.44076547 0.71883070
## thinness..1.19.years -0.16729240 -0.05910629
## Income.composition.of.resources 6.19148283 9.42889208
# Diagnostic plots for the eightVar fit.
plot(eightVar)
# Best MLR: the same model without Alcohol and Polio.
set.seed(2323)
lessvar <- lm(
  Life.expectancy ~ HIV.AIDS + Schooling + BMI + Diphtheria + logGDP +
    thinness..1.19.years + Income.composition.of.resources,
  data = training
)
summary(lessvar)
##
## Call:
## lm(formula = Life.expectancy ~ HIV.AIDS + Schooling + BMI + Diphtheria +
## logGDP + thinness..1.19.years + Income.composition.of.resources,
## data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.382 -2.545 0.095 2.768 20.119
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.642032 0.618828 70.524 < 2e-16 ***
## HIV.AIDS -0.680044 0.019876 -34.215 < 2e-16 ***
## Schooling 0.844081 0.054243 15.561 < 2e-16 ***
## BMI 0.056115 0.006332 8.862 < 2e-16 ***
## Diphtheria 0.067500 0.004437 15.211 < 2e-16 ***
## logGDP 0.599579 0.070806 8.468 < 2e-16 ***
## thinness..1.19.years -0.123684 0.026922 -4.594 4.58e-06 ***
## Income.composition.of.resources 7.952176 0.829769 9.584 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.684 on 2336 degrees of freedom
## Multiple R-squared: 0.7578, Adjusted R-squared: 0.757
## F-statistic: 1044 on 7 and 2336 DF, p-value: < 2.2e-16
# Confidence intervals (2.5% and 97.5% bounds) for the lessvar coefficients.
confint(lessvar)
## 2.5 % 97.5 %
## (Intercept) 42.42852263 44.85554107
## HIV.AIDS -0.71901981 -0.64106761
## Schooling 0.73771169 0.95044987
## BMI 0.04369781 0.06853138
## Diphtheria 0.05879867 0.07620224
## logGDP 0.46073069 0.73842704
## thinness..1.19.years -0.17647798 -0.07089008
## Income.composition.of.resources 6.32501538 9.57933583
# Visuals for residuals: diagnostic plots for the lessvar fit.
plot(lessvar)
#Forward, Backward, Stepwise Selection
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Full model: every candidate predictor, with GDP on the log scale.
set.seed(1246)
fitFull <- lm(
  Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
    percentage.expenditure + Measles + BMI + under.five.deaths + Polio +
    Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
    thinness.5.9.years + Income.composition.of.resources + Schooling + logGDP,
  data = training
)
# Forward selection must start from the smallest model and be told how far it
# may grow. Calling stepAIC(fitFull, direction = "forward") without a scope
# has no terms left to add and simply returns fitFull unchanged, so start
# from the intercept-only model and use the full formula as the upper scope.
# NOTE(review): any output recorded from the earlier no-op call is stale.
fitNull <- lm(Life.expectancy ~ 1, data = training)
stepup <- stepAIC(
  fitNull,
  scope = list(lower = ~1, upper = formula(fitFull)),
  direction = "forward",
  steps = 2000
)
## Start: AIC=6586.08
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP
# Backward elimination by AIC, starting from the full model (at most 2000 steps).
stepdown= stepAIC(fitFull, direction = "backward", steps = 2000)
## Start: AIC=6586.08
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP
##
## Df Sum of Sq RSS AIC
## - thinness.5.9.years 1 9.3 38373 6584.6
## <none> 38364 6586.1
## - thinness..1.19.years 1 64.6 38428 6588.0
## - Measles 1 85.5 38449 6589.3
## - Total.expenditure 1 115.3 38479 6591.1
## - percentage.expenditure 1 209.3 38573 6596.8
## - Alcohol 1 227.5 38591 6597.9
## - Polio 1 360.2 38724 6606.0
## - logGDP 1 488.4 38852 6613.7
## - Diphtheria 1 869.0 39233 6636.6
## - BMI 1 884.3 39248 6637.5
## - Income.composition.of.resources 1 997.9 39362 6644.3
## - infant.deaths 1 1914.8 40279 6698.2
## - under.five.deaths 1 1950.0 40314 6700.3
## - Schooling 1 3043.1 41407 6763.0
## - HIV.AIDS 1 8282.4 46646 7042.3
## - Adult.Mortality 1 9123.8 47488 7084.2
##
## Step: AIC=6584.64
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## Income.composition.of.resources + Schooling + logGDP
##
## Df Sum of Sq RSS AIC
## <none> 38373 6584.6
## - Measles 1 88.3 38461 6588.0
## - Total.expenditure 1 111.9 38485 6589.5
## - thinness..1.19.years 1 125.1 38498 6590.3
## - percentage.expenditure 1 209.3 38582 6595.4
## - Alcohol 1 225.6 38599 6596.4
## - Polio 1 360.1 38733 6604.5
## - logGDP 1 482.6 38856 6611.9
## - Diphtheria 1 873.4 39246 6635.4
## - BMI 1 875.2 39248 6635.5
## - Income.composition.of.resources 1 998.5 39372 6642.9
## - infant.deaths 1 1925.1 40298 6697.4
## - under.five.deaths 1 1955.5 40329 6699.2
## - Schooling 1 3052.4 41425 6762.1
## - HIV.AIDS 1 8273.2 46646 7040.3
## - Adult.Mortality 1 9114.8 47488 7082.2
# Stepwise selection by AIC in both directions, starting from the full model
# (at most 2000 steps).
stepboth = stepAIC(fitFull, direction = "both", steps = 2000)
## Start: AIC=6586.08
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP
##
## Df Sum of Sq RSS AIC
## - thinness.5.9.years 1 9.3 38373 6584.6
## <none> 38364 6586.1
## - thinness..1.19.years 1 64.6 38428 6588.0
## - Measles 1 85.5 38449 6589.3
## - Total.expenditure 1 115.3 38479 6591.1
## - percentage.expenditure 1 209.3 38573 6596.8
## - Alcohol 1 227.5 38591 6597.9
## - Polio 1 360.2 38724 6606.0
## - logGDP 1 488.4 38852 6613.7
## - Diphtheria 1 869.0 39233 6636.6
## - BMI 1 884.3 39248 6637.5
## - Income.composition.of.resources 1 997.9 39362 6644.3
## - infant.deaths 1 1914.8 40279 6698.2
## - under.five.deaths 1 1950.0 40314 6700.3
## - Schooling 1 3043.1 41407 6763.0
## - HIV.AIDS 1 8282.4 46646 7042.3
## - Adult.Mortality 1 9123.8 47488 7084.2
##
## Step: AIC=6584.64
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
## percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## Income.composition.of.resources + Schooling + logGDP
##
## Df Sum of Sq RSS AIC
## <none> 38373 6584.6
## + thinness.5.9.years 1 9.3 38364 6586.1
## - Measles 1 88.3 38461 6588.0
## - Total.expenditure 1 111.9 38485 6589.5
## - thinness..1.19.years 1 125.1 38498 6590.3
## - percentage.expenditure 1 209.3 38582 6595.4
## - Alcohol 1 225.6 38599 6596.4
## - Polio 1 360.1 38733 6604.5
## - logGDP 1 482.6 38856 6611.9
## - Diphtheria 1 873.4 39246 6635.4
## - BMI 1 875.2 39248 6635.5
## - Income.composition.of.resources 1 998.5 39372 6642.9
## - infant.deaths 1 1925.1 40298 6697.4
## - under.five.deaths 1 1955.5 40329 6699.2
## - Schooling 1 3052.4 41425 6762.1
## - HIV.AIDS 1 8273.2 46646 7040.3
## - Adult.Mortality 1 9114.8 47488 7082.2
# Summary for each selected model
# Model returned by the forward stepAIC run.
up = summary(stepup)
up
##
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths +
## Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.0294 -2.1028 0.0757 2.3710 16.0441
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.188e+01 6.977e-01 74.359 < 2e-16 ***
## Adult.Mortality -2.091e-02 8.887e-04 -23.525 < 2e-16 ***
## infant.deaths 9.970e-02 9.251e-03 10.777 < 2e-16 ***
## Alcohol 1.017e-01 2.737e-02 3.715 0.000208 ***
## percentage.expenditure 1.944e-04 5.455e-05 3.563 0.000374 ***
## Measles -1.934e-05 8.491e-06 -2.277 0.022848 *
## BMI 4.111e-02 5.613e-03 7.324 3.31e-13 ***
## under.five.deaths -7.429e-02 6.831e-03 -10.876 < 2e-16 ***
## Polio 2.327e-02 4.977e-03 4.674 3.12e-06 ***
## Total.expenditure 9.986e-02 3.776e-02 2.644 0.008237 **
## Diphtheria 3.639e-02 5.012e-03 7.260 5.25e-13 ***
## HIV.AIDS -4.473e-01 1.996e-02 -22.414 < 2e-16 ***
## thinness..1.19.years -1.103e-01 5.568e-02 -1.980 0.047825 *
## thinness.5.9.years 4.121e-02 5.495e-02 0.750 0.453338
## Income.composition.of.resources 5.677e+00 7.297e-01 7.780 1.08e-14 ***
## Schooling 6.622e-01 4.874e-02 13.586 < 2e-16 ***
## logGDP 3.792e-01 6.968e-02 5.443 5.80e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.06 on 2327 degrees of freedom
## Multiple R-squared: 0.8186, Adjusted R-squared: 0.8174
## F-statistic: 656.5 on 16 and 2327 DF, p-value: < 2.2e-16
# Model returned by the backward stepAIC run.
down = summary(stepdown)
down
##
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths +
## Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## Income.composition.of.resources + Schooling + logGDP, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.0079 -2.1041 0.0671 2.3681 16.0263
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.193e+01 6.942e-01 74.809 < 2e-16 ***
## Adult.Mortality -2.089e-02 8.883e-04 -23.515 < 2e-16 ***
## infant.deaths 9.992e-02 9.246e-03 10.807 < 2e-16 ***
## Alcohol 1.012e-01 2.736e-02 3.700 0.000221 ***
## percentage.expenditure 1.944e-04 5.454e-05 3.564 0.000373 ***
## Measles -1.963e-05 8.482e-06 -2.314 0.020753 *
## BMI 4.061e-02 5.573e-03 7.287 4.32e-13 ***
## under.five.deaths -7.439e-02 6.830e-03 -10.892 < 2e-16 ***
## Polio 2.326e-02 4.977e-03 4.674 3.12e-06 ***
## Total.expenditure 9.821e-02 3.769e-02 2.606 0.009231 **
## Diphtheria 3.647e-02 5.010e-03 7.279 4.57e-13 ***
## HIV.AIDS -4.469e-01 1.995e-02 -22.403 < 2e-16 ***
## thinness..1.19.years -7.361e-02 2.672e-02 -2.755 0.005920 **
## Income.composition.of.resources 5.679e+00 7.296e-01 7.783 1.05e-14 ***
## Schooling 6.630e-01 4.872e-02 13.608 < 2e-16 ***
## logGDP 3.765e-01 6.957e-02 5.411 6.91e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.06 on 2328 degrees of freedom
## Multiple R-squared: 0.8186, Adjusted R-squared: 0.8174
## F-statistic: 700.3 on 15 and 2328 DF, p-value: < 2.2e-16
# Model returned by the stepAIC run with direction = "both".
both = summary(stepboth)
both
##
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths +
## Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## Income.composition.of.resources + Schooling + logGDP, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.0079 -2.1041 0.0671 2.3681 16.0263
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.193e+01 6.942e-01 74.809 < 2e-16 ***
## Adult.Mortality -2.089e-02 8.883e-04 -23.515 < 2e-16 ***
## infant.deaths 9.992e-02 9.246e-03 10.807 < 2e-16 ***
## Alcohol 1.012e-01 2.736e-02 3.700 0.000221 ***
## percentage.expenditure 1.944e-04 5.454e-05 3.564 0.000373 ***
## Measles -1.963e-05 8.482e-06 -2.314 0.020753 *
## BMI 4.061e-02 5.573e-03 7.287 4.32e-13 ***
## under.five.deaths -7.439e-02 6.830e-03 -10.892 < 2e-16 ***
## Polio 2.326e-02 4.977e-03 4.674 3.12e-06 ***
## Total.expenditure 9.821e-02 3.769e-02 2.606 0.009231 **
## Diphtheria 3.647e-02 5.010e-03 7.279 4.57e-13 ***
## HIV.AIDS -4.469e-01 1.995e-02 -22.403 < 2e-16 ***
## thinness..1.19.years -7.361e-02 2.672e-02 -2.755 0.005920 **
## Income.composition.of.resources 5.679e+00 7.296e-01 7.783 1.05e-14 ***
## Schooling 6.630e-01 4.872e-02 13.608 < 2e-16 ***
## logGDP 3.765e-01 6.957e-02 5.411 6.91e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.06 on 2328 degrees of freedom
## Multiple R-squared: 0.8186, Adjusted R-squared: 0.8174
## F-statistic: 700.3 on 15 and 2328 DF, p-value: < 2.2e-16
# Print the full model's call and coefficients.
fitFull
##
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths +
## Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths +
## Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
## thinness.5.9.years + Income.composition.of.resources + Schooling +
## logGDP, data = training)
##
## Coefficients:
## (Intercept) Adult.Mortality
## 5.188e+01 -2.091e-02
## infant.deaths Alcohol
## 9.970e-02 1.017e-01
## percentage.expenditure Measles
## 1.944e-04 -1.934e-05
## BMI under.five.deaths
## 4.111e-02 -7.429e-02
## Polio Total.expenditure
## 2.327e-02 9.986e-02
## Diphtheria HIV.AIDS
## 3.639e-02 -4.473e-01
## thinness..1.19.years thinness.5.9.years
## -1.103e-01 4.121e-02
## Income.composition.of.resources Schooling
## 5.677e+00 6.622e-01
## logGDP
## 3.792e-01
# Forward selection by AIC with olsrr, using the predictors of the full model;
# the printed summary lists the variables in the order they were added.
olsrr::ols_step_forward_aic(fitFull)
##
## Selection Summary
## ------------------------------------------------------------------------------------------------
## Variable AIC Sum Sq RSS R-Sq Adj. R-Sq
## ------------------------------------------------------------------------------------------------
## Schooling 15530.008 108310.949 103218.899 0.51204 0.51183
## Adult.Mortality 14375.533 148508.373 63021.474 0.70207 0.70181
## HIV.AIDS 13971.630 158528.968 53000.879 0.74944 0.74912
## Diphtheria 13721.013 163943.928 47585.920 0.77504 0.77465
## BMI 13591.942 166531.793 44998.054 0.78727 0.78682
## Income.composition.of.resources 13493.362 168421.796 43108.051 0.79621 0.79568
## logGDP 13432.779 169557.522 41972.325 0.80158 0.80098
## Polio 13408.221 170030.391 41499.456 0.80381 0.80314
## thinness..1.19.years 13387.220 170435.632 41094.216 0.80573 0.80498
## Measles 13372.433 170728.879 40800.969 0.80711 0.80629
## percentage.expenditure 13360.849 170964.653 40565.194 0.80823 0.80732
## Total.expenditure 13353.576 171124.817 40405.030 0.80899 0.80800
## Alcohol 13351.222 171199.787 40330.061 0.80934 0.80828
## ------------------------------------------------------------------------------------------------
#KNN
#library(caret)
#fit_cont = trainControl(method = "repeatedcv", number = 10, repeats = 1)
#set.seed(136)
#knnfit = train(Life.expectancy~Adult.Mortality + infant.deaths + Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths + Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + thinness.5.9.years + Income.composition.of.resources + Schooling + logGDP, data =cleandataMedian, method = "knn", trControl = fit_cont, tuneGrid = expand.grid(k = c(1:30)))
#knnfit
#plot(knnfit)
# KNN regression fitted on the training split
library(caret)
# Repeated cross-validation: 10 folds, 1 repeat.
fit_cont1 <- trainControl(method = "repeatedcv", number = 10, repeats = 1)
set.seed(1364)
# Try every k from 1 to 30.
knnfit1 <- train(
  Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
    percentage.expenditure + Measles + BMI + under.five.deaths + Polio +
    Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
    thinness.5.9.years + Income.composition.of.resources + Schooling + logGDP,
  data = training,
  method = "knn",
  trControl = fit_cont1,
  tuneGrid = expand.grid(k = 1:30)
)
plot(knnfit1)
# Validation rows restricted to the response and the predictors used above.
updateval <- validate[, c(
  "Life.expectancy", "Adult.Mortality", "infant.deaths", "Alcohol",
  "percentage.expenditure", "Measles", "BMI", "under.five.deaths", "Polio",
  "Total.expenditure", "Diphtheria", "HIV.AIDS", "thinness..1.19.years",
  "thinness.5.9.years", "Income.composition.of.resources", "Schooling",
  "logGDP"
)]
# NOTE(review): the predictions are stored but not scored against
# validate$Life.expectancy in the code shown here.
prediction <- predict(knnfit1, newdata = validate)
#cf = confusionMatrix(prediction, updateval$Life.expectancy)
#cf
#print(cf)
# World map and color plotting
#Creating the World
#library(ggplot2)
#library(tidyverse)
#library(ggthemes)
#world_map = map_data("world") %>% filter(! long > 180)
#countries = world_map %>% distinct(region) %>% rowid_to_column()
#countries %>% ggplot(aes(fill = rowid, map_id = region)) + geom_map(map = world_map) + expand_limits(x = world_map$long, y = world_map$lat) + coord_map("moll") +theme_map()
#Color world plotting with full data
library(ggplot2)
library(tidyverse)
# Copy of the training data for the map, with the Country column renamed to
# "region" so it matches the join key of the map data.
dataforcolmap <- training
colnames(dataforcolmap)[1] <- "region"
view(dataforcolmap)
# Map data for plotting
mapdata <- map_data("world")
view(mapdata)
# The training data holds one row per country and year. Joining all of them
# repeats every map vertex once per year (dplyr reports this as an unexpected
# many-to-many relationship), so keep a single row per country -- its most
# recent year in the training split -- before joining.
latest_by_region <- dataforcolmap %>%
  group_by(region) %>%
  slice_max(Year, n = 1, with_ties = FALSE) %>%
  ungroup()
# relationship = "many-to-one" makes the join fail loudly if a country ever
# has more than one row again.
mapdata <- left_join(
  mapdata,
  latest_by_region,
  by = "region",
  relationship = "many-to-one"
)
## Warning in left_join(mapdata, dataforcolmap, by = "region"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 11 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
view(mapdata)
# Keep only the map rows that matched a country in the data, separately for
# each plotted variable. Columns are referenced by bare name so filter()
# evaluates them on the data frame being piped in.
# Life expectancy
mapdata1 <- mapdata %>% filter(!is.na(Life.expectancy))
# Status
mapdata2 <- mapdata %>% filter(!is.na(Status))
# Income composition of resources
mapdata3 <- mapdata %>% filter(!is.na(Income.composition.of.resources))
# Shared theme: hide axis text, ticks and titles on every map.
map_theme <- theme(
  axis.text.x = element_blank(),
  axis.text.y = element_blank(),
  axis.ticks = element_blank(),
  axis.title.y = element_blank(),
  axis.title.x = element_blank()
)
# Map of life expectancy
map1 <- ggplot(mapdata1, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Life.expectancy), color = "black") +
  map_theme +
  ggtitle("Life Expectancy per Country") +
  scale_fill_gradient(low = "red", high = "yellow") +
  guides(fill = guide_legend(title = "Life Expectancy"))
map1
# Map of development status. The outline colour is the fixed color = "black";
# the former col = "orange" inside aes() was overridden by it and had no
# effect, so it has been removed.
mapStatus <- ggplot(mapdata2, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Status), color = "black") +
  map_theme +
  ggtitle("Country's Status: Developed v. Developing")
mapStatus
# Map of income composition of resources
mapIncome <- ggplot(mapdata3, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Income.composition.of.resources), color = "black") +
  map_theme +
  ggtitle("Income Composition of Resources per Country") +
  scale_fill_gradient(low = "red", high = "yellow") +
  guides(fill = guide_legend(title = "Income Composition of Resources"))
mapIncome
# Using rpart library
#treeimb <- rpart(ExplVar ~ ., data = train)
#pred.treeimb <- predict(treeimb, newdata = test)